- You have your clean data, now what?
- You need to understand how variables move together!
- Two main tools:
- Correlation analysis.
- Regression analysis.
August 2021
import yfinance as yf

# Tickers of the ten stocks used throughout these slides.
assets = ['AAPL', 'MSFT', 'AMZN', 'FB', 'GOOGL', 'GOOG', 'BRK-B', 'JPM', 'TSLA', 'JNJ']
# Download the price history of every ticker between the two dates
# (progress bar switched off to keep the output clean).
prices = yf.download(assets, start='2000-01-01', end='2020-12-31', progress=False)
# (rows, columns) of the downloaded table.
prices.shape
## (5283, 60)
# Distinct labels of the first column level (the price/volume fields),
# obtained by dropping the ticker level and removing duplicates.
prices.columns.droplevel(1).drop_duplicates()
## Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')
# List of all field names ...
variable_names = prices.columns.droplevel(1).drop_duplicates().to_list()
# ... minus 'Close', so that what remains is the list of fields to discard.
variable_names.remove('Close')
print(variable_names)
## ['Adj Close', 'High', 'Low', 'Open', 'Volume']
# Delete every (field, ticker) column whose field is not 'Close'.
for asset_name in assets:
    for variable in variable_names:
        # Columns are indexed by a (field, ticker) tuple.
        column_name = (variable, asset_name)
        del prices[column_name]
# Show what is left of the table after the deletions above.
print(prices)
## Close ... ## AAPL AMZN ... MSFT TSLA ## Date ... ## 2000-01-03 0.999442 89.375000 ... 58.281250 NaN ## 2000-01-04 0.915179 81.937500 ... 56.312500 NaN ## 2000-01-05 0.928571 69.750000 ... 56.906250 NaN ## 2000-01-06 0.848214 65.562500 ... 55.000000 NaN ## 2000-01-07 0.888393 69.562500 ... 55.718750 NaN ## ... ... ... ... ... ... ## 2020-12-23 130.960007 3185.270020 ... 221.020004 645.979980 ## 2020-12-24 131.970001 3172.689941 ... 222.750000 661.770020 ## 2020-12-28 136.690002 3283.959961 ... 224.960007 663.690002 ## 2020-12-29 134.869995 3322.000000 ... 224.149994 665.989990 ## 2020-12-30 133.720001 3285.850098 ... 221.679993 694.780029 ## ## [5283 rows x 10 columns]
import matplotlib.pyplot as plt

# Line plot of the AAPL closing price against the date index.
# NOTE(review): the trailing semicolons are kept from the original; presumably
# they silence the echoed return values when the chunk is rendered - confirm.
prices[('Close', 'AAPL')].plot();
plt.xlabel("Date");
plt.ylabel("Close");
plt.title("AAPL");
plt.show()
matplotlib
# One panel per stock on a 5x2 grid. plt.subplots already creates the figure,
# so the former separate plt.figure() call (which only left an unused empty
# figure behind) is dropped.
fig, axs = plt.subplots(5, 2)
# axs.flat walks the grid row by row, so panels are filled in the order of `assets`.
for ax, asset_name in zip(axs.flat, assets):
    ax.plot(prices['Close'][asset_name])
    ax.set_title(asset_name)
## [<matplotlib.lines.Line2D object at 0x00000000305F79E8>] ## Text(0.5, 1.0, 'AAPL') ## [<matplotlib.lines.Line2D object at 0x000000002CBF5828>] ## Text(0.5, 1.0, 'MSFT') ## [<matplotlib.lines.Line2D object at 0x0000000030BA9860>] ## Text(0.5, 1.0, 'AMZN') ## [<matplotlib.lines.Line2D object at 0x0000000030615A90>] ## Text(0.5, 1.0, 'FB') ## [<matplotlib.lines.Line2D object at 0x00000000306CECC0>] ## Text(0.5, 1.0, 'GOOGL') ## [<matplotlib.lines.Line2D object at 0x000000003070DF60>] ## Text(0.5, 1.0, 'GOOG') ## [<matplotlib.lines.Line2D object at 0x0000000030741160>] ## Text(0.5, 1.0, 'BRK-B') ## [<matplotlib.lines.Line2D object at 0x00000000307773C8>] ## Text(0.5, 1.0, 'JPM') ## [<matplotlib.lines.Line2D object at 0x00000000307AB5C0>] ## Text(0.5, 1.0, 'TSLA') ## [<matplotlib.lines.Line2D object at 0x00000000307DE7F0>] ## Text(0.5, 1.0, 'JNJ')
# Re-space the subplots of the current figure so that titles and axes do not overlap.
plt.tight_layout();
.pct_change
# Simple returns: relative change of each price from one row to the next
# (the first row has no predecessor and is therefore NaN).
returns = prices.pct_change()
# Drop the first column level so that columns are labelled by ticker only.
returns = returns.droplevel(0, axis=1)
returns
## AAPL AMZN BRK-B ... JPM MSFT TSLA ## Date ... ## 2000-01-03 NaN NaN NaN ... NaN NaN NaN ## 2000-01-04 -0.084310 -0.083217 -0.034561 ... -0.027444 -0.033780 NaN ## 2000-01-05 0.014633 -0.148741 0.016432 ... -0.006173 0.010544 NaN ## 2000-01-06 -0.086538 -0.060036 0.041571 ... 0.014197 -0.033498 NaN ## 2000-01-07 0.047369 0.061010 0.008869 ... 0.018373 0.013068 NaN ## ... ... ... ... ... ... ... ... ## 2020-12-23 -0.006976 -0.006627 0.010636 ... 0.027944 -0.013039 0.008808 ## 2020-12-24 0.007712 -0.003949 0.010212 ... -0.004398 0.007827 0.024444 ## 2020-12-28 0.035766 0.035071 0.008299 ... 0.006585 0.009921 0.002901 ## 2020-12-29 -0.013315 0.011584 0.005079 ... -0.002633 -0.003601 0.003465 ## 2020-12-30 -0.008527 -0.010882 0.000348 ... 0.002800 -0.011019 0.043229 ## ## [5283 rows x 10 columns]
import plotnine as plt9

# Reshape to long format: one row per (date, stock) pair.
returns_plot = returns.stack().reset_index()
returns_plot.columns = ["Date", "stock", "return"]

# One facet per stock, each showing the return series as a line.
(
    plt9.ggplot(returns_plot)
    + plt9.aes(x = "Date", y = "return", colour = "stock")
    + plt9.facet_wrap('stock')
    + plt9.geom_line()
    + plt9.theme(axis_text_x=plt9.element_text(angle=45),
                 strip_text_y=plt9.element_text(size=5),
                 subplots_adjust={'right': 0.8})
    + plt9.labs(title = "Stock returns", y = "Natural daily units", x = "")
)
## <ggplot: (-9223372036748728190)>
pandas
import numpy as np

# Element-wise square root of the covariance matrix of returns, in percent,
# rounded to two decimals. The diagonal is each stock's standard deviation.
# NOTE: an off-diagonal entry would come out as NaN if a covariance were negative.
round(100*np.sqrt(returns.cov()),2)
## AAPL AMZN BRK-B FB GOOG GOOGL JNJ JPM MSFT TSLA ## AAPL 2.56 1.72 0.95 1.30 1.43 1.42 0.85 1.52 1.52 1.38 ## AMZN 1.72 3.25 1.04 1.43 1.51 1.51 0.80 1.61 1.63 1.49 ## BRK-B 0.95 1.04 1.44 0.98 1.04 1.04 0.78 1.33 0.99 1.09 ## FB 1.30 1.43 0.98 2.36 1.37 1.37 0.82 1.09 1.27 1.46 ## GOOG 1.43 1.51 1.04 1.37 1.93 1.92 0.89 1.41 1.32 1.33 ## GOOGL 1.42 1.51 1.04 1.37 1.92 1.92 0.89 1.41 1.32 1.33 ## JNJ 0.85 0.80 0.78 0.82 0.89 0.89 1.24 1.01 0.92 0.83 ## JPM 1.52 1.61 1.33 1.09 1.41 1.41 1.01 2.48 1.49 1.28 ## MSFT 1.52 1.63 0.99 1.27 1.32 1.32 0.92 1.49 1.95 1.35 ## TSLA 1.38 1.49 1.09 1.46 1.33 1.33 0.83 1.28 1.35 3.55
pandas
# Pairwise correlation matrix of returns (pandas' default method is Pearson),
# rounded to two decimals.
returns.corr().round(2)
## AAPL AMZN BRK-B FB GOOG GOOGL JNJ JPM MSFT TSLA ## AAPL 1.00 0.35 0.24 0.39 0.50 0.50 0.23 0.36 0.46 0.30 ## AMZN 0.35 1.00 0.23 0.45 0.49 0.49 0.16 0.32 0.42 0.31 ## BRK-B 0.24 0.23 1.00 0.34 0.41 0.41 0.34 0.49 0.35 0.27 ## FB 0.39 0.45 0.34 1.00 0.50 0.50 0.26 0.30 0.42 0.26 ## GOOG 0.50 0.49 0.41 0.50 1.00 1.00 0.37 0.43 0.53 0.30 ## GOOGL 0.50 0.49 0.41 0.50 1.00 1.00 0.38 0.43 0.53 0.30 ## JNJ 0.23 0.16 0.34 0.26 0.37 0.38 1.00 0.33 0.35 0.18 ## JPM 0.36 0.32 0.49 0.30 0.43 0.43 0.33 1.00 0.46 0.25 ## MSFT 0.46 0.42 0.35 0.42 0.53 0.53 0.35 0.46 1.00 0.32 ## TSLA 0.30 0.31 0.27 0.26 0.30 0.30 0.18 0.25 0.32 1.00
# Same table using Spearman's rank correlation instead of the default method.
returns.corr(method = 'spearman').round(2)
## AAPL AMZN BRK-B FB GOOG GOOGL JNJ JPM MSFT TSLA ## AAPL 1.00 0.41 0.26 0.41 0.48 0.48 0.23 0.37 0.48 0.31 ## AMZN 0.41 1.00 0.28 0.53 0.55 0.55 0.23 0.35 0.46 0.35 ## BRK-B 0.26 0.28 1.00 0.32 0.39 0.39 0.31 0.45 0.31 0.26 ## FB 0.41 0.53 0.32 1.00 0.57 0.58 0.23 0.31 0.44 0.31 ## GOOG 0.48 0.55 0.39 0.57 1.00 1.00 0.33 0.42 0.50 0.33 ## GOOGL 0.48 0.55 0.39 0.58 1.00 1.00 0.33 0.42 0.50 0.33 ## JNJ 0.23 0.23 0.31 0.23 0.33 0.33 1.00 0.33 0.33 0.18 ## JPM 0.37 0.35 0.45 0.31 0.42 0.42 0.33 1.00 0.43 0.24 ## MSFT 0.48 0.46 0.31 0.44 0.50 0.50 0.33 0.43 1.00 0.31 ## TSLA 0.31 0.35 0.26 0.31 0.33 0.33 0.18 0.24 0.31 1.00
# Same table using Kendall's tau.
returns.corr(method = 'kendall').round(2)
## AAPL AMZN BRK-B FB GOOG GOOGL JNJ JPM MSFT TSLA ## AAPL 1.00 0.29 0.18 0.29 0.35 0.35 0.16 0.26 0.34 0.21 ## AMZN 0.29 1.00 0.20 0.39 0.40 0.40 0.16 0.25 0.33 0.24 ## BRK-B 0.18 0.20 1.00 0.22 0.27 0.27 0.22 0.33 0.22 0.18 ## FB 0.29 0.39 0.22 1.00 0.42 0.43 0.16 0.21 0.31 0.21 ## GOOG 0.35 0.40 0.27 0.42 1.00 0.96 0.23 0.30 0.36 0.22 ## GOOGL 0.35 0.40 0.27 0.43 0.96 1.00 0.23 0.29 0.36 0.22 ## JNJ 0.16 0.16 0.22 0.16 0.23 0.23 1.00 0.23 0.23 0.12 ## JPM 0.26 0.25 0.33 0.21 0.30 0.29 0.23 1.00 0.31 0.16 ## MSFT 0.34 0.33 0.22 0.31 0.36 0.36 0.23 0.31 1.00 0.21 ## TSLA 0.21 0.24 0.18 0.21 0.22 0.22 0.12 0.16 0.21 1.00
NA
.GOOGL
and MSFT
are observable for longer periods than FB
, computations are performed for data observed only when FB
is observable.yfinance
library, from 1980 until 2020..cov
scipy
library, and the following piece of code, compare the eigenvalues of the correlation matrices. Are they all positive?import scipy.linalg as la # CORR_MAT is your correlation matrix eigenvalues = la.eig(CORR_MAT)
# Scatter plot ('+' markers, no connecting line) of MSFT returns against AAPL returns.
plt.plot(returns['AAPL'], returns['MSFT'], '+');
plt.xlabel('AAPL');
plt.ylabel('MSFT');
plt.show();
# Scatter plot ('+' markers, no connecting line) of MSFT returns against AAPL returns.
plt.plot(returns['AAPL'], returns['MSFT'], '+');
plt.xlabel('AAPL');
plt.ylabel('MSFT');
plt.show();
statsmodels.api
library.
import statsmodels.api as sm
NumPy
provides the polyfit
function for polynomial regressions.sklearn
has the LinearRegression
function.AAPL
# Computing the market return
# (row-wise mean of the stock returns, i.e. an equally weighted average)
Mkt = returns.mean(axis=1)
Mkt.name = "Market"
# Adding the constant in the regression
regressors = sm.add_constant(Mkt)
regressors
## const Market ## Date ## 2000-01-03 1.0 NaN ## 2000-01-04 1.0 -0.049987 ## 2000-01-05 1.0 -0.017125 ## 2000-01-06 1.0 -0.015495 ## 2000-01-07 1.0 0.031871 ## ... ... ... ## 2020-12-23 1.0 0.002927 ## 2020-12-24 1.0 0.004985 ## 2020-12-28 1.0 0.018353 ## 2020-12-29 1.0 -0.001293 ## 2020-12-30 1.0 -0.001255 ## ## [5283 rows x 2 columns]
AAPL
# Telling Python which model we are going to fit
# (AAPL returns on a constant and the market return; rows with missing
# values are dropped before estimation)
Model = sm.OLS(returns['AAPL'], regressors, missing='drop')
# Fitting the model
Fit = Model.fit()
# print(Fit.summary())
# summary_col builds a compact table from a list of fitted models.
from statsmodels.iolib.summary2 import summary_col
# print(summary_col([Fit],stars=True))
for
loop, and store the results in a list or a dictionary.
RMarkdown
, a module of RStudio
.RStudio
is similar to Spyder
, except it’s mainly designed for R
.Python
with RStudio
install.packages("reticulate") # Installs the package used to run Python from R
# The installation is to be done only ONCE!
library(reticulate) # Calls the package
# Install Python modules
py_install("numpy")
py_install("pandas")
py_install("matplotlib")
# Starting Python
repl_python()
Python
starts: knitr
package allows you to create RMarkdown
documents.
install.packages("knitr")
RMarkdown
document.knit
#
and ##
.-
Python
, followed by the name of the chunk.echo = T
: print the source code verbatim. If FALSE
, only the output is printed.eval = F
: do not evaluate the chunk.
```{python, echo = T}
# Minimal example chunk: import NumPy and create an empty array.
import numpy as np
toto = np.array([])
```
BE CAREFUL: - When you knit the document, this re-evaluates all code chunks. - You may deactivate it chunk by chunk by using the chunk option cache = T
.
Markdown
is a book by Yihui Xie, to be found for free hereMarkdown
, you’ll have to discover it on your own!Python
universe.